#Muwei Zheng and Tyler Moore
#Script for analyzing datasets gathered for the paper:
#M. Zheng, H. Robbins, Z. Chai, P. Thapa, T. Moore. "Cybersecurity Research Datasets: Taxonomy and Empirical Analysis," USENIX Workshop on Cyber Security Experimentation and Test (CSET), 2018. USENIX Association

# Fix the RNG seed so the random down-sampling of papers performed later in
# this script is reproducible.
set.seed(8675309)

###################################################
# read data in (datasets and papers)
###################################################

# One row per dataset. stringsAsFactors is requested explicitly: the
# SubCategory re-levelling below relies on levels() of a column read from the
# CSV, and since R 4.0.0 read.csv() no longer converts strings to factors by
# default (levels() would be NULL and every SubCategory would become NA).
dt <- read.csv('CyberDatasets.csv', stringsAsFactors = TRUE)

# Years elapsed relative to 2017, and citations per elapsed year.
# NOTE(review): a Year of 2017 gives yearPast == 0 and a non-finite
# citeNumYr -- confirm the data contains no such rows.
dt$yearPast <- 2017 - dt$Year
dt$citeNumYr <- dt$citeNum / dt$yearPast

# Reorder the SubCategory levels by position in the default (sorted) level
# vector. NOTE(review): the index vector assumes exactly 14 distinct
# subcategories in the CSV -- confirm if the data changes.
dt$SubCategory <- factor(
  dt$SubCategory,
  levels(dt$SubCategory)[c(4, 14, 8, 7, 2, 6, 3, 9, 10, 5, 1, 11, 12, 13)]
)

# Keep only the listed venues and fold the three workshops into one level.
dt <- dt[dt$Conference %in% c('CCS', 'IMC', 'NDSS', 'USENIX', 'SP', 'FC',
                              'WEIS', 'CCS-AISEC', 'USENIX-CSET', 'FC-BTW'), ]
dt$Conference <- as.character(dt$Conference)
dt$Conference[dt$Conference %in% c("CCS-AISEC", "FC-BTW", "USENIX-CSET")] <- "Workshops"
dt$Conference <- factor(dt$Conference)

# Indicators derived from Origin ('Existing' vs. anything else) and Public.
dt$Exist <- ifelse(dt$Origin == 'Existing', 'Yes', 'No')
dt$CreatedData <- dt$Origin != 'Existing'
dt$DataType <- ifelse(dt$Origin == 'Existing', "Existing", "Created")
dt$hasData <- TRUE
dt$YearF <- factor(dt$Year)
dt$isPublic <- dt$Public == "Yes"

# One row per paper. stringsAsFactors is requested explicitly so that text
# columns are factors regardless of the R version's read.csv() default
# (it changed to FALSE in R 4.0.0); later code indexes tables of pap$Public
# in a way that depends on its factor levels.
pap <- read.csv('CyberPapers.csv', stringsAsFactors = TRUE)

pap$Year <- as.numeric(pap$Year)

# Keep only the listed venues and fold the three workshops into one level.
pap <- pap[pap$Conference %in% c('CCS', 'IMC', 'NDSS', 'USENIX', 'SP', 'FC',
                                 'WEIS', 'CCS-AISEC', 'USENIX-CSET', 'FC-BTW'), ]
pap$Conference <- as.character(pap$Conference)
pap$Conference[pap$Conference %in% c("CCS-AISEC", "FC-BTW", "USENIX-CSET")] <- "Workshops"
pap$Conference <- factor(pap$Conference)

# Years elapsed relative to 2017, and citations per elapsed year.
# NOTE(review): a Year of 2017 gives yearPast == 0 and a non-finite
# citeNumYr -- confirm the data contains no such rows.
pap$yearPast <- 2017 - pap$Year
pap$citeNumYr <- pap$citeNum / pap$yearPast

# Missing ExistOrCreate means the paper used no data; the alphabetical levels
# (Created, Existing, No Data) are then reversed to No Data, Existing, Created.
pap$ExistOrCreate <- as.character(pap$ExistOrCreate)
pap$ExistOrCreate[is.na(pap$ExistOrCreate)] <- "No Data"
pap$ExistOrCreate <- factor(pap$ExistOrCreate)
pap$ExistOrCreate <- factor(pap$ExistOrCreate, levels(pap$ExistOrCreate)[c(3, 2, 1)])

pap$hasData <- pap$Data == "Data"

# Four-way classification of papers by whether created data was published.
# Empty Public -> " No Data"; NA Public -> "Only Existing Data".
# NOTE(review): the leading space in " No Data" presumably makes it sort first
# and so become the reference level in the regressions -- confirm.
pap$PublicCreate <- ifelse(pap$Public == "Yes", "Created Public", "Created Not Public")
pap$PublicCreate[pap$Public == ""] <- " No Data"
pap$PublicCreate[is.na(pap$Public)] <- "Only Existing Data"
pap$PublicCreate <- factor(pap$PublicCreate)

# Create a balanced sample of papers for comparisons over time: for each of
# the first six conferences, the year with the most papers is down-sampled to
# that conference's average paper count over its other years.
cy <- as.data.frame(table(pap$Conference, pap$Year))
names(cy) <- c("Conference", "Year", "NumPapers")

# Per-conference total and per-conference maximum yearly paper count.
cy2 <- cbind(aggregate(NumPapers ~ Conference, sum, data = cy),
             aggregate(NumPapers ~ Conference, max, data = cy))
# Peak years: CCS 2016, FC 2016, IMC 2012, NDSS 2012, SP 2013, USENIX 2014
# Keep the first six conferences; columns 1, 2, 4 are Conference, sum, max.
cy2 <- cy2[c(1, 2, 3, 4, 5, 6), c(1, 2, 4)]
names(cy2) <- c("Conference", "sumpapers", "maxpapers")
# Average over the non-peak years.
# NOTE(review): the divisor 4 assumes five years of data per conference.
cy2$avgpapers <- round((cy2$sumpapers - cy2$maxpapers) / 4)
cy2$maxyear <- c(2016, 2016, 2012, 2012, 2013, 2014)

papsam <- pap
for (i in seq_len(nrow(cy2))) {
  # First remove the entries from this conference's peak year ...
  papsam <- papsam[!(papsam$Conference == cy2$Conference[i] &
                       papsam$Year == cy2$maxyear[i]), ]
  # ... then reintroduce a random subset of them. The sample size must be this
  # conference's own average (avgpapers[i]); passing the whole avgpapers
  # vector as `size` is not a valid per-conference sample size.
  pkyr <- pap[pap$Conference == cy2$Conference[i] & pap$Year == cy2$maxyear[i], ]
  papsam <- rbind(papsam, pkyr[sample(nrow(pkyr), cy2$avgpapers[i]), ])
}

# Left-join the sampled papers to their datasets; papers without any dataset
# are kept and get NA in every dataset column.
dtsam <- merge(papsam, dt, by = "Name", all.x = TRUE)

# Origin with the missing (no-dataset) case turned into an explicit level.
dtsam$Origin2 <- as.character(dtsam$Origin)
dtsam$Origin2[is.na(dtsam$Origin2)] <- "No Data"
dtsam$Origin2 <- factor(dtsam$Origin2)

# Split into existing vs. created datasets. CreatedData is NA for papers with
# no dataset; which() drops those instead of letting logical indexing insert
# an all-NA row for each of them.
dtsamex <- dtsam[which(!dtsam$CreatedData), ]
dtsamcr <- dtsam[which(dtsam$CreatedData), ]


###################################################
# analysis
###################################################


# 1. Dataset usage and production

# Papers by data usage: counts, then rounded percentages.
table(pap$ExistOrCreate)
round(100 * prop.table(table(pap$ExistOrCreate)))

# Papers by whether created data was made public: counts, then percentages.
table(pap$PublicCreate)
round(100 * prop.table(table(pap$PublicCreate)))

# Datasets: Origin x Public counts (a) and row percentages (b).
a <- table(dt$Origin, dt$Public)
b <- round(100 * prop.table(a, 1))

# Interleave count / percent for the first and second Public column and
# export as an "&"-separated table.
pubdt <- setNames(
  data.frame(cbind(a[, 1], b[, 1], a[, 2], b[, 2])),
  c("Not Public #", "Not Public %", "Public #", "Public %")
)
write.table(pubdt, "pubdt.txt", sep = "&", row.names = TRUE, quote = FALSE)

# Standardized residuals of the chi-squared test on the same table.
chisq.test(a)$stdres


# Stacked bars: per year, the percentage of sampled papers in each
# ExistOrCreate class. The y range runs to 120 while the axis is drawn only
# up to 100, leaving headroom above the bars.
pdf("pctdatabar.pdf")
par(mar = c(4, 4, 0, 1))
barplot(
  100 * prop.table(table(papsam$ExistOrCreate, papsam$Year), 2),
  col = 1:3,
  density = seq(10, 40, 10),
  angle = c(30, 90, 150),
  lwd = 1.5,
  border = 1:3,
  legend.text = c("No Dataset", "Used Existing Dataset", "Created Dataset"),
  ylim = c(0, 120),
  axes = FALSE,
  ylab = "% of papers",
  cex.lab = 1.3,
  cex.axis = 1.3
)
axis(2, at = seq(0, 100, by = 20))
dev.off()

# Sampled papers that created a dataset / that only used existing ones.
papsamcr <- papsam[papsam$ExistOrCreate == "Created", ]
papsamex <- papsam[papsam$ExistOrCreate == "Existing", ]

# Per-year share of each Public value among dataset-creating papers.
prop.table(table(papsamcr$Public, papsamcr$Year), 2)

# Line plot of the share made public. The "Yes" row is selected by name: a
# positional row index is only right when Public is a factor that still
# carries an unused empty-string level ahead of "No" and "Yes".
pdf("pctcrpubtime.pdf", height = 4, width = 7)
par(mar = c(4, 6, 0.5, 0.5))
plot(
  x = 2012:2016,
  y = 100 * prop.table(table(papsamcr$Public, papsamcr$Year), 2)["Yes", ],
  type = 'l',
  ylim = c(0, 35),
  lwd = 2,
  lty = 1,
  xlab = "",
  ylab = "% created datasets\n made public",
  cex.lab = 1.5,
  cex.axis = 1.4
)
dev.off()




# Stacked bars: per year, the percentage of datasets in each Category. The
# y range runs to 130 while the axis is drawn only up to 100, leaving
# headroom above the bars.
pdf("timecatall.pdf", height = 6, width = 6)
par(mar = c(2.5, 4, 0.5, 0.5), mfrow = c(1, 1))
barplot(
  100 * prop.table(table(dt$Category, dt$Year), 2),
  col = 1:4,
  density = seq(10, 50, 10),
  angle = c(30, 90, 150, 210),
  lwd = 1.5,
  border = 1:4,
  legend.text = c("Attacker Related", "Defender Artifacts",
                  "Macro-level Internet Characteristics",
                  "User and Organizational Characteristics"),
  ylim = c(0, 130),
  axes = FALSE,
  ylab = "% datasets in category"
)
axis(2, at = seq(0, 100, by = 20))
dev.off()



# 2. Subcategories

# Median citations per year for every (SubCategory, CreatedData) pair,
# ordered by subcategory level.
citesc2 <- aggregate(citeNumYr ~ SubCategory + CreatedData, data = dt, FUN = median)
citesc2 <- citesc2[order(citesc2$SubCategory), ]

# Bar labels: subcategory name followed by "(created)" or "(existing)".
citesc2$lab <- paste(
  citesc2$SubCategory,
  ifelse(citesc2$CreatedData, "(created)", "(existing)")
)
# Shortened labels for rows 7 and 8.
citesc2$lab[7] <- "Cybercrime Inf. (existing)"
citesc2$lab[8] <- "Cybercrime Inf. (created)"

# One colour per run of bars: 8, 4, 10 and 6 bars in colours 1 to 4; fill
# and border share the same colours.
# NOTE(review): the run lengths assume 28 rows in this fixed order -- confirm.
bar_colours <- rep(c(1, 2, 3, 4), times = c(8, 4, 10, 6))

pdf("citessubcatce.pdf", height = 7, width = 10)
par(mar = c(11, 4.1, 1, 1))
barplot(
  citesc2$citeNumYr,
  names.arg = citesc2$lab,
  las = 3,
  col = bar_colours,
  border = bar_colours,
  density = c(15, 30),
  angle = c(30, 150),
  ylab = "Median # citations per year",
  cex.lab = 1.4,
  cex.axis = 1.25
)
dev.off()



# Citation measures and regressions

# Papers that created a dataset.
papcr <- pap[pap$ExistOrCreate == "Created", ]

# Distinct (paper, category, subcategory, origin) combinations, attached to
# the dataset-creating papers (dpcr) and to all papers (dp).
dtsub <- unique(dt[, c("Name", "Category", "SubCategory", "Origin")])
dpcr <- merge(papcr, dtsub, by = "Name")
dp <- merge(pap, dtsub, by = "Name")

# Copy of SubCategory with the 7th level moved to the front.
# NOTE(review): only 13 level positions are listed, so a 14th level (if
# present) would become NA; the models below use SubCategory, not
# SubCategoryApp -- confirm which one was intended.
app_level_order <- c(7, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13)

dpcr$SubCategoryApp <- dpcr$SubCategory
dpcr$SubCategoryApp <- factor(
  dpcr$SubCategoryApp,
  levels(dpcr$SubCategoryApp)[app_level_order]
)

dp$SubCategoryApp <- dp$SubCategory
dp$SubCategoryApp <- factor(
  dp$SubCategoryApp,
  levels(dp$SubCategoryApp)[app_level_order]
)

library(stargazer)

# Nested linear models of citation count for dataset-creating papers.
cr1 <- lm(citeNum ~ yearPast, data = papcr)
cr2 <- lm(citeNum ~ yearPast + Conference, data = papcr)
cr3 <- lm(citeNum ~ yearPast + Conference + Public, data = papcr)
cr4 <- lm(citeNum ~ yearPast + Conference + Public + SubCategory, data = dpcr)
stargazer(
  cr1, cr2, cr3, cr4,
  align = TRUE,
  single.row = FALSE,
  no.space = TRUE,
  omit.stat = c("f"),
  out = "regcr.tex"
)

# Alternative specification: use all papers and the categorical PublicCreate
# variable instead of Public.
dr1 <- lm(citeNum ~ yearPast, data = pap)
dr2 <- lm(citeNum ~ yearPast + Conference, data = pap)
dr3 <- lm(citeNum ~ yearPast + Conference + PublicCreate, data = pap)
dr4 <- lm(citeNum ~ yearPast + Conference + PublicCreate + SubCategory, data = dp)
stargazer(
  dr1, dr2, dr3, dr4,
  align = TRUE,
  single.row = FALSE,
  no.space = TRUE,
  omit.stat = c("f"),
  out = "regdr.tex"
)

# Incidence of datasets by subcategory, with the row-wise split by Exist
# and by Public (all as rounded percentages).
subcat_by_exist <- table(dt$SubCategory, dt$Exist)
subcat_by_public <- table(dt$SubCategory, dt$Public)

tabcat <- data.frame(cbind(
  round(100 * prop.table(table(dt$SubCategory))),
  round(100 * prop.table(subcat_by_exist, 1)),
  round(100 * prop.table(subcat_by_public, 1))
))
names(tabcat) <- c("pctcat", "pctcr", "pctex", "pctpri", "pctpub")

# Only the overall share, the created share and the public share are exported.
tabcat <- tabcat[, c("pctcat", "pctcr", "pctpub")]
write.table(tabcat, "tabcat.txt", sep = "&", row.names = TRUE, quote = FALSE)

# Standardized residuals show which created / public proportions deviate
# from independence.
chisq.test(subcat_by_exist)$stdres
chisq.test(subcat_by_public)$stdres

# Median number of citations per year for the different paper types.
aggregate(citeNumYr ~ PublicCreate, data = pap, FUN = median)
